In [4]:
from smod_wrapper import SMoDWrapper
from sklearn.cluster import KMeans
import numpy as np
from sklearn.metrics import roc_auc_score
import datetime
import time
In [5]:
from eden.util import configure_logging
import logging
# Configure the root logger once for the whole notebook.
# verbosity=1 — presumably a moderately verbose (INFO-like) level;
# see eden.util.configure_logging for the exact mapping.
logger = logging.getLogger()
configure_logging(logger,verbosity=1)
In [6]:
import random
def random_string(length, alphabet_list):
    """Return a random string of `length` characters drawn uniformly
    (with replacement) from `alphabet_list`."""
    chars = []
    for _ in range(length):
        chars.append(random.choice(alphabet_list))
    return ''.join(chars)
def perturb(seed, alphabet_list, p=0.5):
    """Return a noisy copy of `seed`: each character is independently
    replaced by a random character from `alphabet_list` with probability p
    (note: the replacement may coincide with the original character)."""
    out = []
    for original in seed:
        if random.random() < p:
            out.append(random.choice(alphabet_list))
        else:
            out.append(original)
    return ''.join(out)
def make_artificial_dataset(alphabet='ACGT', motives=None, motif_length=6,
                            sequence_length=100, n_sequences=1000, n_motives=2, p=0.2,
                            random_state=1):
    """Generate synthetic sequences, each containing one noisy copy of every motif.

    Each output sequence is the concatenation of `n_motives` segments; a
    segment is left-flank + perturbed motif + right-flank.  If `motives` is
    None, `n_motives` random motifs of `motif_length` are generated;
    otherwise `motif_length`/`n_motives` are inferred from `motives`.

    Parameters
    ----------
    alphabet : characters to draw from.
    motives : optional list of motif strings (all assumed the same length).
    p : per-character perturbation probability for the planted motif.
    random_state : seed for the global `random` module.

    Returns
    -------
    (motives, seqs, binary_seq) where `seqs` is a list of
    ('ID<k>', sequence) pairs and `binary_seq` is a 0/1 string marking
    motif positions (built from the idealized skeleton, so it matches
    every generated sequence).
    """
    random.seed(random_state)
    alphabet_list = [c for c in alphabet]

    if motives is None:
        motives = []
        for i in range(n_motives):
            motives.append(random_string(motif_length, alphabet_list))
    else:
        motif_length = len(motives[0])
        n_motives = len(motives)

    # Floor division ('//') keeps these lengths integral under Python 3 as
    # well; true division would produce floats and break range()/string
    # multiplication below.  Behavior under Python 2 is unchanged.
    sequence_length = sequence_length // len(motives)
    flanking_length = (sequence_length - motif_length) // 2

    n_seq_per_motif = n_sequences
    counter = 0
    seqs = []
    for i in range(n_seq_per_motif):
        total_seq = ''
        for j in range(n_motives):
            left_flanking = random_string(flanking_length, alphabet_list)
            right_flanking = random_string(flanking_length, alphabet_list)
            noisy_motif = perturb(motives[j], alphabet_list, p)
            total_seq += left_flanking + noisy_motif + right_flanking
        seqs.append(('ID%d' % counter, total_seq))
        counter += 1

    # One skeleton per motif segment: 0s over flanks, 1s over the motif.
    binary_skeleton = '0' * flanking_length + '1' * motif_length + '0' * flanking_length
    binary_seq = binary_skeleton * n_motives
    return motives, seqs, binary_seq
In [7]:
def score_seqs(seqs, n_motives, tool):
    """Score each sequence in `seqs` with `tool`, one score vector per motif,
    averaged position-wise over all of the tool's motifs.

    `seqs` is a list of (id, sequence) pairs.  Returns one per-position
    score array/list per sequence; returns [] when `tool` is None.
    Raises ValueError if the tool reports zero motifs.
    """
    scores = []
    if tool is None:
        return scores
    for idx in range(len(seqs)):
        per_motif = []
        n = tool.nmotifs
        # motif_num is 1-based in the tool's API.
        for m in range(n):
            per_motif.append(tool.score(motif_num=m + 1, seq=seqs[idx][1]))
        if len(per_motif) > 1:
            # Average the per-position scores across all motifs.
            stacked = np.array(per_motif[0])
            for m in range(1, n):
                stacked = np.vstack((stacked, per_motif[m]))
            scores.append(list(np.mean(stacked, axis=0)))
        elif len(per_motif) == 1:
            scores.append(np.array(per_motif[0]))
        else:
            raise ValueError("no sequence score")
    return scores
In [8]:
def get_dataset(sequence_length=200,
                n_sequences=200,
                motif_length=10,
                n_motives=2,
                p=0.2,
                random_state=1):
    """Build a train/test split of positive (motif-bearing) and negative
    (shuffled) sequences for one synthetic dataset.

    Returns (block_size, train_pos_seqs, train_neg_seqs, test_pos_seqs,
    n_motives, true_score); `true_score` is the per-position 0/1 float
    vector marking motif positions in every positive sequence.
    """
    motives, pos_seqs, binary_seq = make_artificial_dataset(alphabet='ACGT',
                                                            sequence_length=sequence_length,
                                                            n_sequences=n_sequences,
                                                            motif_length=motif_length,
                                                            n_motives=n_motives,
                                                            p=p,
                                                            random_state=random_state)
    from eden.modifier.seq import seq_to_seq, shuffle_modifier
    # Negatives: each positive sequence shuffled (order 2), twice per positive.
    neg_seqs = seq_to_seq(pos_seqs, modifier=shuffle_modifier, times=2, order=2)
    neg_seqs = list(neg_seqs)

    # Floor division ('//') keeps sizes integral under Python 3 too; true
    # division yields floats, which break the slice indices below.
    block_size = n_sequences // 8

    pos_size = len(pos_seqs)
    train_pos_seqs = pos_seqs[:pos_size // 2]
    test_pos_seqs = pos_seqs[pos_size // 2:]

    neg_size = len(neg_seqs)
    train_neg_seqs = neg_seqs[:neg_size // 2]
    # Note: only the training half of the negatives is consumed; the test
    # half was computed but never used in the original and is dropped here.

    true_score = [float(int(i)) for i in binary_seq]
    return (block_size, train_pos_seqs, train_neg_seqs, test_pos_seqs, n_motives, true_score)
In [9]:
def test_on_datasets(n_sets=5, param_setting=None, p=0.2, max_roc=0.5, std_roc=0.01):
    """Evaluate one SMoD parameter setting on `n_sets` independently
    generated synthetic datasets.

    For each dataset: fit SMoDWrapper on train sequences, score the held-out
    positives, and compute the ROC-AUC of the mean per-position score against
    the known motif mask.  Stops early when a dataset scores below 0.6
    (setting judged not worth further evaluation).

    Returns the list of ROC-AUC values collected (possibly shorter than
    `n_sets`).  `max_roc`/`std_roc` are accepted for the (commented-out)
    z-score early-exit criterion and are currently unused.
    """
    dataset_score = []
    seeds = [i * 2000 for i in range(1, n_sets + 1)]
    for k in range(n_sets):
        # Generate data set
        seed = seeds[k]
        data = get_dataset(sequence_length=40,
                           n_sequences=50,
                           motif_length=10,
                           n_motives=2,
                           p=p,
                           random_state=seed)
        block_size = data[0]
        train_pos_seqs = data[1]
        train_neg_seqs = data[2]
        test_pos_seqs = data[3]
        n_motives = data[4]
        true_score = data[5]
        smod = SMoDWrapper(alphabet='dna',
                           scoring_criteria='pwm',
                           complexity=5,
                           n_clusters=10,
                           min_subarray_size=8,
                           max_subarray_size=12,
                           clusterer=KMeans(),
                           pos_block_size=block_size,
                           neg_block_size=block_size,
                           # sample_size = 300,
                           p_value=param_setting['p_value'],
                           similarity_th=param_setting['similarity_th'],
                           min_score=param_setting['min_score'],
                           min_freq=param_setting['min_freq'],
                           min_cluster_size=param_setting['min_cluster_size'],
                           regex_th=param_setting['regex_th'],
                           freq_th=param_setting['freq_th'],
                           std_th=param_setting['std_th'])
        try:
            smod.fit(train_pos_seqs, train_neg_seqs)
            scores = score_seqs(seqs=test_pos_seqs,
                                n_motives=n_motives,
                                tool=smod)
        except Exception:
            # Fitting/scoring can fail for a poor parameter setting; skip
            # this dataset.  Catching Exception (rather than a bare except)
            # keeps KeyboardInterrupt/SystemExit working.
            continue
        mean_score = np.mean(scores, axis=0)
        roc_score = roc_auc_score(true_score, mean_score)
        # if a parameter setting performs poorly, don't test on other datasets
        # z-score = (x - mu)/sigma
        # if ((roc_score - max_roc)/std_roc) > 2:
        if roc_score < 0.6:
            break
        dataset_score.append(roc_score)
    return dataset_score
In [10]:
def check_validity(key, value, noise):
    """Validate a sampled hyper-parameter value.

    Returns (is_valid, value); integer-valued parameters ('min_score',
    'min_cluster_size') are rounded to int on success, all others are
    returned as-is.  `noise` bounds 'min_freq' from above (at most
    1 - noise).  Raises ValueError for an unknown key.
    """
    integer_keys = ('min_score', 'min_cluster_size')
    if key == 'min_score':            # at least greater than (motif_length)/2
        ok = value >= 5
    elif key == 'min_cluster_size':
        ok = value >= 3
    elif key == 'min_freq':           # at most (1 - noise_level)
        ok = 0 < value <= (1 - noise)
    elif key == 'p_value':
        ok = 0.0 <= value <= 1.0
    elif key == 'similarity_th':
        ok = 0.8 <= value <= 1.0
    elif key == 'regex_th':
        ok = 0 < value <= 0.3
    elif key == 'freq_th':
        ok = 0 < value <= 1.0
    elif key == 'std_th':
        ok = 0 < value <= 1.0
    else:
        raise ValueError('Invalid key: ', key)
    if ok:
        if key in integer_keys:
            return True, int(round(value))
        return True, value
    return False, value
def random_setting(parameters=None, best_config=None, noise=None):
    """Sample one hyper-parameter configuration.

    On the first call (empty history lists in `parameters`), seeds every
    history with the corresponding `best_config` value and returns
    `best_config` itself.  On later calls, draws each parameter from
    N(mu, 2*sigma) computed over its history, rejecting draws until
    check_validity accepts one; after MAX_ITER rejections, falls back to
    the history mean.  Mutates `parameters` only on the first call.
    NOTE(review): sampled values are never appended to the history, so the
    distribution does not adapt across calls — confirm this is intended.
    """
    parameter_setting = {}
    MAX_ITER = 1000
    if not parameters['min_score']:  # use best_configuration of last run as initial setting
        for key in parameters.keys():
            parameters[key].append(best_config[key])
            parameter_setting[key] = best_config[key]
    else:
        for key in parameters.keys():
            success = False
            n_iter = 0
            mu = np.mean(parameters[key])
            # Bug fix: sigma was computed with np.mean (copy-paste error) —
            # the zero-guard below only makes sense for a standard deviation.
            sigma = np.std(parameters[key])
            if sigma == 0:
                # Bug fix: original used 'sigma == 0.1', a no-op comparison
                # instead of an assignment, leaving sigma at 0.
                sigma = 0.1
            while not success:
                if n_iter == MAX_ITER:  # if max_iterations exceeded, return mean as value
                    value = mu
                    if key in ['min_score', 'min_cluster_size']:
                        value = int(round(value))
                    break
                value = np.random.normal(mu, 2 * sigma)
                n_iter += 1
                success, value = check_validity(key, value, noise)
            parameter_setting[key] = value
    return parameter_setting
In [ ]:
%%time
# Random search over SMoD hyper-parameters, swept across several motif
# perturbation probabilities.  NOTE(review): Python-2 print syntax — this
# cell requires a Python 2 kernel.
print datetime.datetime.fromtimestamp(time.time()).strftime('%H:%M:%S'),
print "Starting experiment...\n"
# Hand-picked initial configuration; seeds the sampling history inside
# random_setting at the start of each noise level.
best_config = {'min_score':6, # atleast motif_length/2
               'min_freq':0.1, # can not be more than (1- noise level)
               'min_cluster_size':3, # atleast 3
               'p_value':0.1, # atleast 0.1
               'similarity_th':0.8, # 0.8
               'regex_th':0.3, # max 0.3
               'freq_th':0.05, # 0.05
               'std_th':0.2} # 0.2
# Final results
param = [0.1, 0.2, 0.3]#, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
results_dic = {}  # noise level -> parameter setting recorded for that level
reps = 30 #0 # different settings to be tried
# NOTE(review): indentation was lost in this export; the nesting below is
# reconstructed from context — confirm against the original notebook.
for i in param:
    parameters = {'min_freq': [],
                  'min_cluster_size': [],
                  'p_value': [],
                  'similarity_th': [],
                  'min_score': [],
                  'regex_th': [],
                  'freq_th': [],
                  'std_th': []}
    max_roc = 0.5
    std_roc = 0.01
    #parameters = generate_dist(parameters, best_config)
    for j in range(reps):
        param_setting = random_setting(parameters, best_config, i) # Randomize Parameter setting
        n_sets = 5 # Different data sets
        dataset_score = test_on_datasets(n_sets=n_sets,
                                         param_setting=param_setting,
                                         p=i,
                                         max_roc=max_roc,
                                         std_roc=std_roc)
        mean_roc = np.mean(dataset_score)
        std = np.std(dataset_score)
        if mean_roc > max_roc:
            # New best setting for this noise level: remember it and log it.
            max_roc = mean_roc
            std_roc = std
            print datetime.datetime.fromtimestamp(time.time()).strftime('%H:%M:%S'),
            print "Better Configuration found at perturbation prob = ", i
            print "ROC: ", mean_roc
            print "Parameter Configuration: ", param_setting
            print
            best_config = param_setting
    # Record the last sampled setting for this noise level, tagged with its ROC.
    param_setting["ROC"] = mean_roc
    results_dic[i] = param_setting
print datetime.datetime.fromtimestamp(time.time()).strftime('%H:%M:%S'),
print " Finished experiment..."
In [ ]:
In [ ]: